Applying our zero-inflated model to the cortical data (L5 only).
We select only the cells that pass QC and that were clustered (no -1 labels), color coded by the cluster labels. I will focus on the top 1,000 most variable genes. Note that the data are not public, hence it will work only on Davide’s system (for now).
data("cortical")
labels <- read.table("~/git/brainData/analysis/results_160705/cluster_labels.txt", stringsAsFactors = FALSE, sep='\t', comment.char = "%")
l5_data <- cortical[, rownames(labels[labels$ClusterMergedLabel!="-1",])]
filter <- apply(assay(l5_data)>10, 1, sum)>=10
Number of retained genes:
print(sum(filter))
## [1] 8036
To speed up the computations, I will focus on the top 1,000 most variable genes.
l5_data <- l5_data[filter,]
core <- assay(l5_data)
vars <- rowVars(log1p(core))
names(vars) <- rownames(core)
vars <- sort(vars, decreasing = TRUE)
core <- core[names(vars)[1:1000],]
First, let’s look at PCA (of the log counts) for reference.
colMerged <- labels[labels$ClusterMergedLabel!="-1", "ClusterMergedColor"]
detection_rate <- colSums(core>0)
coverage <- colSums(core)
pca <- prcomp(t(log1p(core)))
plot(pca$x, col=colMerged, pch=19, main="PCA of log-counts, centered not scaled")
df <- data.frame(PC1=pca$x[,1], PC2=pca$x[,2], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## PC1 PC2 detection_rate coverage
## PC1 1.0000000 0.156529233 -0.797149452 -0.8500634
## PC2 0.1565292 1.000000000 -0.001071276 -0.2898612
## detection_rate -0.7971495 -0.001071276 1.000000000 0.4682109
## coverage -0.8500634 -0.289861237 0.468210942 1.0000000
Now, let’s see how ZINB works.
print(system.time(zinb_Vall <- zinbFit(core, ncores = 3, K = 2)))
## user system elapsed
## 123.079 26.572 65.975
plot(zinb_Vall@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_Vall@W[,1], W2=zinb_Vall@W[,2], gamma_mu = zinb_Vall@gamma_mu[1,], gamma_pi = zinb_Vall@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.0000000 -0.37088778 0.5808957 -0.68002463
## W2 -0.3708878 1.00000000 -0.2975745 0.01314882
## gamma_mu 0.5808957 -0.29757451 1.0000000 -0.28499939
## gamma_pi -0.6800246 0.01314882 -0.2849994 1.00000000
## detection_rate 0.7381200 -0.05866148 0.4529805 -0.97174374
## coverage 0.5464538 -0.38472669 0.9552966 -0.32764896
## detection_rate coverage
## W1 0.73811996 0.5464538
## W2 -0.05866148 -0.3847267
## gamma_mu 0.45298050 0.9552966
## gamma_pi -0.97174374 -0.3276490
## detection_rate 1.00000000 0.4682109
## coverage 0.46821094 1.0000000
batch <- droplevels(colData(l5_data)$MD_c1_run_id)
mod <- model.matrix(~batch)
boxplot(df$W1~batch)
boxplot(df$W2~batch)
print(system.time(zinb_X <- zinbFit(core, ncores = 3, K = 2, X=mod)))
## user system elapsed
## 322.892 56.209 140.675
plot(zinb_X@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_X@W[,1], W2=zinb_X@W[,2], gamma_mu = zinb_X@gamma_mu[1,], gamma_pi = zinb_X@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi detection_rate
## W1 1.0000000 -0.4285219 0.5895325 -0.5957512 0.6873644
## W2 -0.4285219 1.0000000 -0.4851126 0.1285883 -0.1901410
## gamma_mu 0.5895325 -0.4851126 1.0000000 -0.3227129 0.4838874
## gamma_pi -0.5957512 0.1285883 -0.3227129 1.0000000 -0.9544406
## detection_rate 0.6873644 -0.1901410 0.4838874 -0.9544406 1.0000000
## coverage 0.5129822 -0.5692522 0.9523981 -0.3437921 0.4682109
## coverage
## W1 0.5129822
## W2 -0.5692522
## gamma_mu 0.9523981
## gamma_pi -0.3437921
## detection_rate 0.4682109
## coverage 1.0000000
boxplot(df$W1~batch)
boxplot(df$W2~batch)
qc <- as.matrix(colData(l5_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])
plot(df$W1~qcpca$x[,1], pch=19, col=colMerged)
plot(df$W2~qcpca$x[,1], pch=19, col=colMerged)
print(system.time(zinb_q <- zinbFit(core, ncores = 3, K = 2, X=mod)))
## user system elapsed
## 372.457 61.163 163.809
plot(zinb_q@W, col=colMerged, pch=19, xlab="W1", ylab="W2", main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_q@W[,1], W2=zinb_q@W[,2], gamma_mu = zinb_q@gamma_mu[1,], gamma_pi = zinb_q@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 gamma_mu gamma_pi
## W1 1.0000000 0.182434988 0.194849547 -0.3957306
## W2 0.1824350 1.000000000 0.003560832 -0.1673526
## gamma_mu 0.1948495 0.003560832 1.000000000 -0.3569265
## gamma_pi -0.3957306 -0.167352629 -0.356926451 1.0000000
## detection_rate 0.4379676 0.180685646 0.502168620 -0.9535241
## coverage 0.1098789 0.005351177 0.953320973 -0.3530355
## detection_rate coverage
## W1 0.4379676 0.109878881
## W2 0.1806856 0.005351177
## gamma_mu 0.5021686 0.953320973
## gamma_pi -0.9535241 -0.353035509
## detection_rate 1.0000000 0.468210942
## coverage 0.4682109 1.000000000
boxplot(df$W1~batch)
boxplot(df$W2~batch)
print(system.time(zinb_3 <- zinbFit(core, ncores = 3, K = 3, X=mod)))
## user system elapsed
## 401.602 66.262 173.422
pairs(zinb_3@W, col=colMerged, pch=19, main="ZINB")
#total number of detected genes in the cell
df <- data.frame(W1=zinb_3@W[,1], W2=zinb_3@W[,2], W3=zinb_3@W[,3], gamma_mu = zinb_3@gamma_mu[1,], gamma_pi = zinb_3@gamma_pi[1,], detection_rate=detection_rate, coverage=coverage)
pairs(df, pch=19, col=colMerged)
print(cor(df, method="spearman"))
## W1 W2 W3 gamma_mu
## W1 1.00000000 -0.33654363 -0.019749590 -0.217045507
## W2 -0.33654363 1.00000000 -0.013168732 0.045486125
## W3 -0.01974959 -0.01316873 1.000000000 0.003234138
## gamma_mu -0.21704551 0.04548612 0.003234138 1.000000000
## gamma_pi 0.47335254 -0.17336272 0.108652182 -0.355591805
## detection_rate -0.51908169 0.19423094 -0.108389528 0.509108822
## coverage -0.12447772 0.06739511 0.013016634 0.952445360
## gamma_pi detection_rate coverage
## W1 0.4733525 -0.5190817 -0.12447772
## W2 -0.1733627 0.1942309 0.06739511
## W3 0.1086522 -0.1083895 0.01301663
## gamma_mu -0.3555918 0.5091088 0.95244536
## gamma_pi 1.0000000 -0.9497425 -0.34834942
## detection_rate -0.9497425 1.0000000 0.46821094
## coverage -0.3483494 0.4682109 1.00000000
noglia_data <- cortical[,rownames(labels[!(labels$ClusterMergedLabel %in% c("-1", "m4", "m5")),])]
filter <- apply(assay(noglia_data)>10, 1, sum)>=10
noglia <- assay(noglia_data[filter,])
colNoglia <- labels[!(labels$ClusterMergedLabel %in% c("-1", "m4", "m5")), "ClusterMergedColor"]
vars <- rowVars(log1p(noglia))
names(vars) <- rownames(noglia)
vars <- sort(vars, decreasing = TRUE)
noglia <- noglia[names(vars)[1:1000],]
batch <- droplevels(colData(noglia_data)$MD_c1_run_id)
qc <- as.matrix(colData(noglia_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])
print(system.time(zinb_noglia <- zinbFit(noglia, ncores = 3, K = 3, X=mod)))
## user system elapsed
## 402.567 33.575 169.758
pairs(zinb_noglia@W, col=colNoglia, pch=19, main="ZINB")
nopink_data <- cortical[,rownames(labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")),])]
filter <- apply(assay(nopink_data)>10, 1, sum)>=10
nopink <- assay(nopink_data[filter,])
colNopink <- labels[!(labels$ClusterMergedLabel %in% c("-1", "m2", "m4", "m5", "m6")), "ClusterMergedColor"]
vars <- rowVars(log1p(nopink))
names(vars) <- rownames(nopink)
vars <- sort(vars, decreasing = TRUE)
nopink <- nopink[names(vars)[1:1000],]
batch <- droplevels(colData(nopink_data)$MD_c1_run_id)
qc <- as.matrix(colData(nopink_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])
print(system.time(zinb_nopink <- zinbFit(nopink, ncores = 3, K = 3, X=mod)))
## user system elapsed
## 370.877 55.012 159.651
pairs(zinb_nopink@W, col=colNopink, pch=19, main="ZINB")
core <- assay(l5_data)
batch <- droplevels(colData(l5_data)$MD_c1_run_id)
qc <- as.matrix(colData(l5_data)[,1:14])
qcpca <- prcomp(qc, center=TRUE, scale.=TRUE)
mod <- model.matrix(~batch + qcpca$x[,1:2])
print(system.time(zinb_all <- zinbFit(core, ncores = 3, K = 3, X=mod)))
## Warning in optimize(f = zinb.loglik.dispersion, Y = Y, mu = mu, logitPi =
## logitPi, : NA/Inf replaced by maximum positive value
## Warning in optimize(f = zinb.loglik.dispersion, Y = Y, mu = mu, logitPi =
## logitPi, : NA/Inf replaced by maximum positive value
## Warning in optimize(f = zinb.loglik.dispersion, Y = Y, mu = mu, logitPi =
## logitPi, : NA/Inf replaced by maximum positive value
## user system elapsed
## 3408.537 501.766 1541.282
pairs(zinb_all@W, col=colMerged, pch=19, main="ZINB")